In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling

1.Reading in Data and do basic profiling¶

In [2]:
data_df = pd.read_csv("./Shopping_Daten.csv")
data_df.head()
Out[2]:
Unnamed: 0 CustomerID Gender Location Tenure_Months Transaction_ID Transaction_Date Product_SKU Product_Description Product_Category ... Avg_Price Delivery_Charges Coupon_Status GST Date Offline_Spend Online_Spend Month Coupon_Code Discount_pct
0 0 17850.0 M Chicago 12.0 16679.0 2019-01-01 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA ... 153.71 6.5 Used 0.1 1/1/2019 4500.0 2424.5 1 ELEC10 10.0
1 1 17850.0 M Chicago 12.0 16680.0 2019-01-01 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA ... 153.71 6.5 Used 0.1 1/1/2019 4500.0 2424.5 1 ELEC10 10.0
2 2 17850.0 M Chicago 12.0 16696.0 2019-01-01 GGOENEBQ078999 Nest Cam Outdoor Security Camera - USA Nest-USA ... 122.77 6.5 Not Used 0.1 1/1/2019 4500.0 2424.5 1 ELEC10 10.0
3 3 17850.0 M Chicago 12.0 16699.0 2019-01-01 GGOENEBQ079099 Nest Protect Smoke + CO White Battery Alarm-USA Nest-USA ... 81.50 6.5 Clicked 0.1 1/1/2019 4500.0 2424.5 1 ELEC10 10.0
4 4 17850.0 M Chicago 12.0 16700.0 2019-01-01 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Nest-USA ... 153.71 6.5 Clicked 0.1 1/1/2019 4500.0 2424.5 1 ELEC10 10.0

5 rows × 21 columns

In [3]:
profile = pandas_profiling.ProfileReport(data_df, title='Pandas Profiling Report', explorative=True)
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[3]:

In [4]:
data_df.shape
Out[4]:
(52955, 21)
In [5]:
data_df.isna().sum()
Out[5]:
Unnamed: 0               0
CustomerID              31
Gender                  31
Location                31
Tenure_Months           31
Transaction_ID          31
Transaction_Date        31
Product_SKU             31
Product_Description     31
Product_Category         0
Quantity                31
Avg_Price               31
Delivery_Charges        31
Coupon_Status           31
GST                     31
Date                    31
Offline_Spend           31
Online_Spend            31
Month                    0
Coupon_Code            400
Discount_pct           400
dtype: int64
In [6]:
data_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52955 entries, 0 to 52954
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           52955 non-null  int64  
 1   CustomerID           52924 non-null  float64
 2   Gender               52924 non-null  object 
 3   Location             52924 non-null  object 
 4   Tenure_Months        52924 non-null  float64
 5   Transaction_ID       52924 non-null  float64
 6   Transaction_Date     52924 non-null  object 
 7   Product_SKU          52924 non-null  object 
 8   Product_Description  52924 non-null  object 
 9   Product_Category     52955 non-null  object 
 10  Quantity             52924 non-null  float64
 11  Avg_Price            52924 non-null  float64
 12  Delivery_Charges     52924 non-null  float64
 13  Coupon_Status        52924 non-null  object 
 14  GST                  52924 non-null  float64
 15  Date                 52924 non-null  object 
 16  Offline_Spend        52924 non-null  float64
 17  Online_Spend         52924 non-null  float64
 18  Month                52955 non-null  int64  
 19  Coupon_Code          52555 non-null  object 
 20  Discount_pct         52555 non-null  float64
dtypes: float64(10), int64(2), object(9)
memory usage: 8.5+ MB
In [7]:
data_df.duplicated().any()
Out[7]:
False
In [8]:
data_df.describe()
Out[8]:
Unnamed: 0 CustomerID Tenure_Months Transaction_ID Quantity Avg_Price Delivery_Charges GST Offline_Spend Online_Spend Month Discount_pct
count 52955.000000 52924.00000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52924.000000 52955.000000 52555.000000
mean 26477.000000 15346.70981 26.127995 32409.825675 4.497638 52.237646 10.517630 0.137462 2830.914141 1893.109119 6.652800 19.953382
std 15286.936089 1766.55602 13.478285 8648.668977 20.104711 64.006882 19.475613 0.045825 936.154247 807.014092 3.333664 8.127108
min 0.000000 12346.00000 2.000000 16679.000000 1.000000 0.390000 0.000000 0.050000 500.000000 320.250000 1.000000 10.000000
25% 13238.500000 13869.00000 15.000000 25384.000000 1.000000 5.700000 6.000000 0.100000 2500.000000 1252.630000 4.000000 10.000000
50% 26477.000000 15311.00000 27.000000 32625.500000 1.000000 16.990000 6.000000 0.180000 3000.000000 1837.870000 7.000000 20.000000
75% 39715.500000 16996.25000 37.000000 39126.250000 2.000000 102.130000 6.500000 0.180000 3500.000000 2425.350000 9.000000 30.000000
max 52954.000000 18283.00000 50.000000 48497.000000 900.000000 355.740000 521.360000 0.180000 5000.000000 4556.930000 12.000000 30.000000
In [9]:
data_df.describe(exclude = np.number)
Out[9]:
Gender Location Transaction_Date Product_SKU Product_Description Product_Category Coupon_Status Date Coupon_Code
count 52924 52924 52924 52924 52924 52955 52924 52924 52555
unique 2 5 365 1145 404 21 3 365 48
top F Chicago 2019-11-27 GGOENEBJ079499 Nest Learning Thermostat 3rd Gen-USA - Stainle... Apparel Clicked 11/27/2019 SALE20
freq 33007 18380 335 3511 3511 18126 26926 335 6373

2.Cleaning up Data¶

Since the there are rows with NaN-values and some columns are not needed for further analysis, we remove those.

In [10]:
# first drop unnecessary columns 

data_df.drop(columns = ["Unnamed: 0", "CustomerID", "Coupon_Code", "Product_SKU", "Product_Description", "Transaction_ID"], inplace = True)
In [11]:
data_df.isna().sum()
Out[11]:
Gender               31
Location             31
Tenure_Months        31
Transaction_Date     31
Product_Category      0
Quantity             31
Avg_Price            31
Delivery_Charges     31
Coupon_Status        31
GST                  31
Date                 31
Offline_Spend        31
Online_Spend         31
Month                 0
Discount_pct        400
dtype: int64
In [12]:
data_df.dropna(inplace = True)
In [13]:
data_df.isna().sum()
Out[13]:
Gender              0
Location            0
Tenure_Months       0
Transaction_Date    0
Product_Category    0
Quantity            0
Avg_Price           0
Delivery_Charges    0
Coupon_Status       0
GST                 0
Date                0
Offline_Spend       0
Online_Spend        0
Month               0
Discount_pct        0
dtype: int64
In [14]:
data_df.to_csv("./shopping_data_cleaned_full.csv")
In [ ]: